library(dplyr)
library(RMySQL)
library(tidyr)
library(ggplot2)
library(magrittr)
source('../../shiny_apps/data_loading.R')

Load in Wordbank common data.

wordbank <- connect.to.wordbank("prod")

common.tables <- get.common.tables(wordbank)

admins <- get.administration.data(common.tables$momed,
                                  common.tables$child,
                                  common.tables$instrumentsmap,
                                  common.tables$administration)

items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap,
                       common.tables$category)

Get by-child productive vocabulary data.

num.words <- items %>%
  filter(type == "word") %>%
  group_by(language, form) %>%
  summarise(n.words = n())

vocab.admins <- admins %>%
  select(data_id, language, form, age, sex, comprehension, production) %>%
  filter(!is.na(sex), sex != "")

vocab.data <- vocab.admins %>%
  group_by(language, form, sex, age) %>%
  left_join(num.words) %>%
  mutate(comprehension = as.numeric(comprehension)/n.words,
         production = as.numeric(production)/n.words) %>%
  gather(measure, value, comprehension, production) %>%
  filter(form == "WG" | (form == "WS" & measure == "production"))
  
vocab.sample.sizes <- vocab.admins %>%
  group_by(language, form) %>%
  summarise(n.admins = length(unique(data_id)))

vocab.data %<>% left_join(vocab.sample.sizes) %>%
  filter(n.admins >= 100) %>%
  mutate(label = sprintf("%s %s (n = %s)", language, form, n.admins))

Plot WS vocabulary size across age.

ggplot(filter(vocab.data, form == "WS"), aes(x = age, y = value, color = sex)) +
  facet_wrap(~ language) +
  geom_jitter(size = 0.7) +
  scale_color_brewer(palette = "Set1", name = "Sex") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Vocabulary Size\n") +
  xlab("\nAge (months)") +
  theme_bw(base_size = 12) +
  theme(text = element_text(family = font))

Plot WG vocabulary size across age.

ggplot(filter(vocab.data, form == "WG"), aes(x = age, y = value, color = sex)) +
  facet_grid(language ~ measure) +
  geom_jitter(size = 0.7) +
  scale_color_brewer(palette = "Set1", name = "Sex") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Vocabulary Size\n") +
  xlab("\nAge (months)") +
  theme_bw(base_size = 12) +
  theme(text = element_text(family = font))

Plot WS vocabulary size across age with model fit curves.

ggplot(filter(vocab.data, form == "WS"), aes(x = age, y = value, color = sex)) +
  facet_wrap(~ language) +
  geom_smooth(method = "lm", formula = y ~ I(x^3) + I(x^2) + x) +
  scale_color_brewer(palette = "Set1", name = "Sex") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Vocabulary Size\n") +
  xlab("\nAge (months)") +
  theme_bw(base_size = 12) +
  theme(text = element_text(family = font))

Plot WG vocabulary size across age with model fit curves.

ggplot(filter(vocab.data, form == "WG"), aes(x = age, y = value, color = sex)) +
  facet_grid(language ~ measure) +
  geom_smooth(method = "lm", formula = y ~ I(x^3) + I(x^2) + x) +
  scale_color_brewer(palette = "Set1", name = "Sex") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Vocabulary Size\n") +
  xlab("\nAge (months)") +
  theme_bw(base_size = 12) +
  theme(text = element_text(family = font))

Function for resampling data.

sample.diffs <- function(d, num.times) {
  
  get.diff <- function(group_data) {
    pts <- seq(min(group_data$age), max(group_data$age), 1)
    male <- predict(lm(value ~ I(age^3) + I(age^2) + I(age),
                       data = filter(group_data, sex == "M")),
                    newdata = data.frame(age = pts))
    female <- predict(lm(value ~ I(age^3) + I(age^2) + I(age),
                         data = filter(group_data, sex == "F")),
                    newdata = data.frame(age = pts))
    mean(female - male)
  }
    
  counter = 1
  sample.diff <- function(d) {
    d.frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>% # resample kids
      do(diff = get.diff(.)) %>%
      mutate(diff = diff[1]) %>%
      rename_(.dots = setNames("diff", counter))

    counter <<- counter + 1 # increment counter outside scope
    return(d.frame)
  }

  diffs <- replicate(num.times, sample.diff(d), simplify=FALSE)
  
  Reduce(left_join, diffs) %>%
    gather(sample, diff, -language, -form, -measure)
}

Resample data, compute difference in vocabulary size between sexes for each sample, find the mean and CI of the difference estimate.

diffs <- sample.diffs(vocab.data, 100)

diffs.summary <- diffs %>%
  group_by(language, form, measure) %>%
  summarise(mean =  mean(diff, na.rm = TRUE),
            ci.high = quantile(diff, 0.975, na.rm = TRUE),
            ci.low = quantile(diff, 0.025, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(language = factor(language))

Plot mean difference by language, form, and measure.

ggplot(diffs.summary, aes(x = mean, y = language)) +
  facet_grid(~ form + measure) +
  geom_point(color = "steelblue") +
  geom_segment(aes(x = ci.low, xend = ci.high,
                   y = language, yend = language),
               color = "steelblue") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  theme_bw() +
  theme(text = element_text(family = font)) +
  xlab("\nMean gender difference in vocabulary size across age") +
  ylab("")